Assignment1

Author

Xinran Wang

Settings

suppressPackageStartupMessages(library(knitr))

suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(datasauRus))

suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggthemes))
suppressPackageStartupMessages(library(ggpubr))

suppressPackageStartupMessages(library(leaflet))
suppressPackageStartupMessages(library(leaflet.extras2))
idir <- "/Users/xinranwang/Documents/Course/25Fall/PM566/Data"

EDA

  1. (30 points) Given the formulated question from the assignment description, you will now conduct EDA Checklist items 1-5. First, download 2002 and 2022 data for all sites in California from the EPA Air Quality Data website, then read the data into R. For each of the two datasets, check the dimensions, headers, footers, variable names and variable types. Check the distribution of the key variable we are analyzing (PM). Write up a summary of all of your findings.

    Summary. This data frame includes measurements at the state level (California), county level (51 counties), and city level (199 monitoring sites). PM2.5 was recorded using 21 different methods (see Method.Description for details) and reported in units of µg/m3 LC.

    read_and_check <- function(idir, filename, year){
      df <- read.csv(paste0(idir, "/", filename)) %>% mutate(Year = year)
      print(paste0("Dimension: ", paste(dim(df), collapse = " x "),
                   "; # Rows: ", nrow(df), 
                   "; # Columns: ", ncol(df)))
    
      print(paste0("Variables names: ", paste(colnames(df), collapse = ", ")))
    
      print("Variable structures: ")
      str(df)
    
      return(df)
    }

    2002

    df_2002 <- read_and_check(idir, "ad_viz_plotval_data_PM2.5_2002.csv", "2002")
    [1] "Dimension: 15976 x 23; # Rows: 15976; # Columns: 23"
    [1] "Variables names: Date, Source, Site.ID, POC, Daily.Mean.PM2.5.Concentration, Units, Daily.AQI.Value, Local.Site.Name, Daily.Obs.Count, Percent.Complete, AQS.Parameter.Code, AQS.Parameter.Description, Method.Code, Method.Description, CBSA.Code, CBSA.Name, State.FIPS.Code, State, County.FIPS.Code, County, Site.Latitude, Site.Longitude, Year"
    [1] "Variable structures: "
    'data.frame':   15976 obs. of  23 variables:
     $ Date                          : chr  "01/05/2002" "01/06/2002" "01/08/2002" "01/11/2002" ...
     $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
     $ Site.ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
     $ POC                           : int  1 1 1 1 1 1 1 1 1 1 ...
     $ Daily.Mean.PM2.5.Concentration: num  25.1 31.6 21.4 25.9 34.5 41 29.3 15 18.8 37.9 ...
     $ Units                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
     $ Daily.AQI.Value               : int  81 93 74 82 98 115 89 62 69 107 ...
     $ Local.Site.Name               : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
     $ Daily.Obs.Count               : int  1 1 1 1 1 1 1 1 1 1 ...
     $ Percent.Complete              : num  100 100 100 100 100 100 100 100 100 100 ...
     $ AQS.Parameter.Code            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
     $ AQS.Parameter.Description     : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
     $ Method.Code                   : int  120 120 120 120 120 120 120 120 120 120 ...
     $ Method.Description            : chr  "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" "Andersen RAAS2.5-300 PM2.5 SEQ w/WINS" ...
     $ CBSA.Code                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
     $ CBSA.Name                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
     $ State.FIPS.Code               : int  6 6 6 6 6 6 6 6 6 6 ...
     $ State                         : chr  "California" "California" "California" "California" ...
     $ County.FIPS.Code              : int  1 1 1 1 1 1 1 1 1 1 ...
     $ County                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
     $ Site.Latitude                 : num  37.7 37.7 37.7 37.7 37.7 ...
     $ Site.Longitude                : num  -122 -122 -122 -122 -122 ...
     $ Year                          : chr  "2002" "2002" "2002" "2002" ...
    kable(head(df_2002, 1), align = "c")
    Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete AQS.Parameter.Code AQS.Parameter.Description Method.Code Method.Description CBSA.Code CBSA.Name State.FIPS.Code State County.FIPS.Code County Site.Latitude Site.Longitude Year
    01/05/2002 AQS 60010007 1 25.1 ug/m3 LC 81 Livermore 1 100 88101 PM2.5 - Local Conditions 120 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS 41860 San Francisco-Oakland-Hayward, CA 6 California 1 Alameda 37.68753 -121.7842 2002
    kable(tail(df_2002, 1), align = "c")
    Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete AQS.Parameter.Code AQS.Parameter.Description Method.Code Method.Description CBSA.Code CBSA.Name State.FIPS.Code State County.FIPS.Code County Site.Latitude Site.Longitude Year
    15976 12/31/2002 AQS 61131003 1 6 ug/m3 LC 33 Woodland-Gibson Road 1 100 88101 PM2.5 - Local Conditions 117 R & P Model 2000 PM2.5 Sampler w/WINS 40900 Sacramento–Roseville–Arden-Arcade, CA 6 California 113 Yolo 38.66121 -121.7327 2002

    2022

    df_2022 <- read_and_check(idir, "ad_viz_plotval_data_PM2.5_2022.csv", "2022")
    [1] "Dimension: 59918 x 23; # Rows: 59918; # Columns: 23"
    [1] "Variables names: Date, Source, Site.ID, POC, Daily.Mean.PM2.5.Concentration, Units, Daily.AQI.Value, Local.Site.Name, Daily.Obs.Count, Percent.Complete, AQS.Parameter.Code, AQS.Parameter.Description, Method.Code, Method.Description, CBSA.Code, CBSA.Name, State.FIPS.Code, State, County.FIPS.Code, County, Site.Latitude, Site.Longitude, Year"
    [1] "Variable structures: "
    'data.frame':   59918 obs. of  23 variables:
     $ Date                          : chr  "01/01/2022" "01/02/2022" "01/03/2022" "01/04/2022" ...
     $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
     $ Site.ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
     $ POC                           : int  3 3 3 3 3 3 3 3 3 3 ...
     $ Daily.Mean.PM2.5.Concentration: num  12.7 13.9 7.1 3.7 4.2 3.8 2.3 6.9 13.6 11.2 ...
     $ Units                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
     $ Daily.AQI.Value               : int  58 60 39 21 23 21 13 38 59 55 ...
     $ Local.Site.Name               : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
     $ Daily.Obs.Count               : int  1 1 1 1 1 1 1 1 1 1 ...
     $ Percent.Complete              : num  100 100 100 100 100 100 100 100 100 100 ...
     $ AQS.Parameter.Code            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
     $ AQS.Parameter.Description     : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
     $ Method.Code                   : int  170 170 170 170 170 170 170 170 170 170 ...
     $ Method.Description            : chr  "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" "Met One BAM-1020 Mass Monitor w/VSCC" ...
     $ CBSA.Code                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
     $ CBSA.Name                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
     $ State.FIPS.Code               : int  6 6 6 6 6 6 6 6 6 6 ...
     $ State                         : chr  "California" "California" "California" "California" ...
     $ County.FIPS.Code              : int  1 1 1 1 1 1 1 1 1 1 ...
     $ County                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
     $ Site.Latitude                 : num  37.7 37.7 37.7 37.7 37.7 ...
     $ Site.Longitude                : num  -122 -122 -122 -122 -122 ...
     $ Year                          : chr  "2022" "2022" "2022" "2022" ...
    kable(head(df_2022, 1), align = "c")
    Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete AQS.Parameter.Code AQS.Parameter.Description Method.Code Method.Description CBSA.Code CBSA.Name State.FIPS.Code State County.FIPS.Code County Site.Latitude Site.Longitude Year
    01/01/2022 AQS 60010007 3 12.7 ug/m3 LC 58 Livermore 1 100 88101 PM2.5 - Local Conditions 170 Met One BAM-1020 Mass Monitor w/VSCC 41860 San Francisco-Oakland-Hayward, CA 6 California 1 Alameda 37.68753 -121.7842 2022
    kable(tail(df_2022, 1), align = "c")
    Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete AQS.Parameter.Code AQS.Parameter.Description Method.Code Method.Description CBSA.Code CBSA.Name State.FIPS.Code State County.FIPS.Code County Site.Latitude Site.Longitude Year
    59918 12/31/2022 AQS 61131003 1 1 ug/m3 LC 6 Woodland-Gibson Road 1 100 88101 PM2.5 - Local Conditions 145 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC 40900 Sacramento–Roseville–Arden-Arcade, CA 6 California 113 Yolo 38.66121 -121.7327 2022
    identical(colnames(df_2002), colnames(df_2022))
    [1] TRUE

    Note. Here, I combined data frames as they have identical column names.

    df <- rbind(df_2002, df_2022)
    rm(df_2002, df_2022)

    Distribution of PM 2.5

    Summary. There is a striking decrease in daily PM2.5 concentrations from 2002 to 2022. Mean and median PM 2.5 were cut in half. However, there are more outliers in 2022 and max PM 2.5 increased by 3 fold

    df <- df %>% rename(PM2.5 = Daily.Mean.PM2.5.Concentration) 
    df$Year <- factor(df$Year, levels = c("2002", "2022"))
    
    summary1 <- df %>% 
      group_by(Year) %>%
      summarise(
        mean          = round(mean(PM2.5, na.rm = T), 2), 
        sd            = round(sd(PM2.5, na.rm = TRUE), 2),
        min           = min(PM2.5, na.rm = TRUE),
        q25           = quantile(PM2.5, 0.25, na.rm = TRUE),
        median        = median(PM2.5, na.rm = TRUE),
        q75           = quantile(PM2.5, 0.75, na.rm = TRUE),
        max           = max(PM2.5, na.rm = TRUE)
      ) 
    kable(as.data.frame(summary1), align = "c")
    Year mean sd min q25 median q75 max
    2002 16.12 13.87 0.0 7.0 12.0 20.5 104.3
    2022 8.41 7.64 -6.7 4.1 6.8 10.7 302.5
  2. (10 points) Combine the two years of data into one data frame. Use the Date variable to create a new column for year, which will serve as an identifier. Change the names of the key variables so that they are easier to refer to in your code.

    Note. (1) I’ve created new column for year when reading data, (2) I’ve renamed Daily.Mean.PM2.5.Concentration to PM2.5 in previous step (3) I’ve combined data in previous step (before plotting).

  3. (20 points) Create a basic map (or maps) in leaflet showing the locations of the monitoring sites, using different colors for each year. Summarize the spatial distribution of the sites. How does this distribution change from 2002 to 2022?

    Map (2002 vs 2022)

    Summary. By 2022, more sites had been established, frequently clustering near the 2002 sites (see map above). The number of sites increased 3.75-fold from 2002 to 2022 (rising from 15,978 to 59,918 sites; see histrogram and table in Question 4).

    # Map
    #scales::hue_pal()(2)
    pal <- leaflet::colorFactor(c("#F8766D", "#00BFC4"), domain = levels(df$Year))
    
    leaflet(df) %>%
      addMapPane("left",  zIndex = 410) %>% 
      addMapPane("right", zIndex = 410) %>% 
    
      addProviderTiles('OpenStreetMap',
                   options = pathOptions(pane = "left"),
                   layerId = "layer_2002") %>%
      addProviderTiles('OpenStreetMap',
                   options = pathOptions(pane = "right"),
                   layerId = "layer_2022") %>%
    
      addCircles(data = df %>% filter(Year == 2002),
                 lat = ~Site.Latitude,
                 lng = ~Site.Longitude,
                 fillColor = "#F8766D",
                 color = "black",
                 stroke = TRUE,
                 weight = 0.2,
                 opacity = 0.5,
                 fillOpacity = 1, 
                 radius = 100,
                 options = pathOptions(pane = "left"),
                 group = "2002") %>%
      addCircles(data = df %>% filter(Year == 2022),
                 lat = ~Site.Latitude,
                 lng = ~Site.Longitude,
                 fillColor = "#00BFC4",
                 color = "black",
                 stroke = TRUE,
                 weight = 0.2,
                 opacity = 0.5,
                 fillOpacity = 1, 
                 radius = 100, 
                 options = pathOptions(pane = "right"),
                 group = "2022") %>%
    
      addLegend("topright", 
                title = "Location of Monitor Sites",
                pal = pal, values = ~Year, labels = ~Year
                ) %>%
    
      addLayersControl(overlayGroups = ~c("2002", "2022"),
                   options = layersControlOptions(collapsed = F)) %>%
    
      addSidebyside(
          layerId = "sidecontrols",
          leftId = "layer_2002",
          rightId = "layer_2022")
    #https://rdrr.io/cran/leaflet.extras2/man/addSidebyside.html
  4. (10 points) Check for any data issues such as missing or implausible values of PM in the combined dataset. Calculate the proportion of missing/implausible values for each year and report any temporal patterns you see in these observations.

    Missing values. There are no missing values in either year.

    Implausible values. In 2022, 215 out of 59,918 observations (0.36%) were implausible (defined as PM2.5 < 0).

    Temporal Patterns. The number of implausible observations increased in 2022, although the count remained very small overall).

    summary2 <- df %>% 
      group_by(Year) %>%
      summarise(
        `# Total`          = n(),
        `# Missing`        = sum(is.na(PM2.5)),
        `# Implausible`    = sum(PM2.5 < 0, na.rm = TRUE)
      ) %>%
      mutate(    
        `% Missing`     = paste0(round((`# Missing` / `# Total`) * 100, 2), "%"),
        `% Implausible` = paste0(round((`# Implausible` / `# Total`) * 100, 2), "%")) %>%
      select(`# Total`, `# Missing`, `% Missing`, `# Implausible`, `% Implausible`)
    
    kable(as.data.frame(summary2), align = "c")
    # Total # Missing % Missing # Implausible % Implausible
    15976 0 0% 0 0%
    59918 0 0% 215 0.36%
    # Count of PM 2.5 Concentration
    df$Year <- factor(df$Year, levels = c("2022", "2002"))  # 2002 will be drawn last (on top)
    ggplot(df, aes(x = PM2.5, fill = Year, color = Year)) +
      geom_histogram(bins = 100, alpha = 0.5, position = "identity") + 
      labs(title = "Histrogram of Daily PM 2.5 Concentration", 
           x = "PM2.5 Concentration",
           y = "Count") +
      scale_color_manual(name = "Year",
                         values = c("2002" = "#F8766D", "2022" = "#00BFC4")) +
      scale_fill_manual(name = "Year",
                         values = c("2002" = "#F8766D", "2022" = "#00BFC4")) +
      theme_classic()

    df$Year <- factor(df$Year, levels = c("2002", "2022"))  # 2002 will be drawn last (on top)
  5. (30 points) Explore the main question of interest at three different levels of spatial resolution. Create exploratory plots (e.g. boxplots, histograms, line plots, violin plots) and summary statistics that best suit each level of data. Be sure to write up explanations of what you observe in these data.

  • Level 1: State. Examine the primary question for the entire state.

  • Level 2: County. Examine the primary question for every county in California.

  • Level 3: City. Restrict the data to sites in Los Angeles county and examine the primary question for every site.

    Level 1: State

    Summary. The mean of daily PM 2.5 in 2022 (mean: 8.41; sd: 7.64) is lower than than 2002 (mean: 16.12; sd: 13.87), despite more outliers.

    summary3 <- df %>% 
      group_by(Year) %>%
      summarise(
        n             = n(),
        mean          = round(mean(PM2.5, na.rm = T), 2), 
        sd            = round(sd(PM2.5, na.rm = T), 2)) %>%
      mutate(`Mean (SD)`= paste0(mean, " (", sd, ")")) %>%
      select(Year, `Mean (SD)`) %>%
      arrange(Year)
    kable(as.data.frame(summary3), align = "c")
    Year Mean (SD)
    2002 16.12 (13.87)
    2022 8.41 (7.64)
    ggplot(df, aes_string(x = "Year", y = "PM2.5", fill = "Year", color = "Year")) +
      geom_boxplot(color = "black", size = 0.2) +
      labs(title = paste0(" Daily PM2.5 Concentration (2002 vs 2022) in California"),
           x = "Year",
           y = paste0("Daily PM2.5 Concentration")) +
      scale_color_manual(name = "Year",
                         values = c("2002" = "#F8766D", "2022" = "#00BFC4")) +
      theme_minimal() +
      theme(legend.position = "bottom",
            legend.box = "horizontal",
            legend.justification = "center")
    Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
    ℹ Please use tidy evaluation idioms with `aes()`.
    ℹ See also `vignette("ggplot2-in-packages")` for more information.
    Warning: No shared levels found between `names(values)` of the manual scale and the
    data's colour values.

    Level 2: County

    Summary. The mean of daily PM 2.5 for most counties in California in 2022 are lower than that of 2000, except for Trinity, Siskiyou, Mono, and Del Norte.

    summary4 <- df %>% 
      group_by(County, Year) %>%
      summarise(
        n             = n(),
        mean          = round(mean(PM2.5, na.rm = T), 2), 
        sd            = round(sd(PM2.5, na.rm = TRUE), 2)) %>%
      mutate(`Mean (SD)`= paste0(mean, " (", sd, ")")) %>%
      select(Year, County, `Mean (SD)`) %>%
      pivot_wider(
        names_from = Year, 
        values_from = `Mean (SD)`, 
        names_prefix = "Mean (SD) in ") %>%
      select(County, `Mean (SD) in 2002`, `Mean (SD) in 2022`) 
    `summarise()` has grouped output by 'County'. You can override using the
    `.groups` argument.
    kable(as.data.frame(summary4), align = "c")
    County Mean (SD) in 2002 Mean (SD) in 2022
    Alameda 14.25 (11.43) 8.2 (4.95)
    Butte 14.76 (11.66) 6.19 (5.79)
    Calaveras 9.9 (6.5) 6.04 (4.1)
    Colusa 11.73 (10.01) 7.61 (4.76)
    Contra Costa 15.09 (14.47) 8.24 (4.93)
    Del Norte 3.82 (2.56) 4.97 (3.58)
    El Dorado 4.91 (4.07) 4.07 (9.18)
    Fresno 19.93 (18.91) 10.19 (8.3)
    Glenn NA 5.34 (4.98)
    Humboldt 7.79 (4.59) 6.76 (4.35)
    Imperial 12.71 (7.7) 9.55 (6.39)
    Inyo 7.14 (10.49) 5.55 (5.47)
    Kern 23 (18.72) 9.88 (9.91)
    Kings 24.7 (19.06) 14.4 (10.78)
    Lake 6.5 (10.88) 4.26 (3.12)
    Los Angeles 19.66 (11.88) 10.97 (5.24)
    Madera NA 10.39 (7.26)
    Marin 7.31 (4.87) 6.54 (4.17)
    Mariposa 9.48 (5.97) 9.47 (17.78)
    Mendocino 8.85 (8.79) 10.14 (5.19)
    Merced 21.54 (15.32) 10.06 (7.32)
    Modoc 5 (0) NA
    Mono 2.68 (2.57) 4.43 (6.09)
    Monterey 8.97 (4.53) 4.75 (3.21)
    Nevada 7.36 (3.41) 7.59 (15.86)
    Orange 17.83 (10.55) 9.24 (4.43)
    Placer 13.1 (10.24) 7.28 (15.48)
    Plumas 12.86 (8.94) 11.23 (10.33)
    Riverside 22.37 (16.1) 9.13 (6.27)
    Sacramento 15.17 (14.51) 8.25 (7.54)
    San Benito 4.99 (2.92) 4.71 (2.69)
    San Bernardino 16.3 (13.08) 9.42 (5.19)
    San Diego 15.27 (7.65) 8.3 (4.36)
    San Francisco 15.31 (14.21) 6.78 (5.14)
    San Joaquin 16.76 (12.26) 7.9 (7.81)
    San Luis Obispo 8.88 (5.15) 7.1 (4.52)
    San Mateo 12.24 (8.91) 6.8 (4.94)
    Santa Barbara 7.04 (3.92) 6.06 (3.51)
    Santa Clara 14.83 (10.65) 8.77 (5.59)
    Santa Cruz 8.55 (4.34) 5.23 (3.74)
    Shasta 7.25 (8.38) 5.04 (4.67)
    Siskiyou 2.69 (2.06) 7.6 (17.3)
    Solano 15.48 (14.15) 7.9 (5.46)
    Sonoma 11.13 (9.5) 6.38 (4.29)
    Stanislaus 19.66 (17.09) 12.12 (8.42)
    Sutter 12.82 (10.2) 10.04 (6.6)
    Tehama NA 5.87 (4.5)
    Trinity 2.78 (2.4) 10.73 (23.1)
    Tulare 23.1 (16.15) 10.41 (7.53)
    Ventura 13.4 (7.24) 6.85 (3.83)
    Yolo 10.58 (8.86) 5.44 (5.29)
    df_couty <- df %>% group_by(Year, County) %>% 
      summarise(
        n             = n(),
        mean_PM2.5    = mean(PM2.5, na.rm = T), 
        sd_PM2.5      = sd(PM2.5, na.rm = T),
      ) %>%
      mutate(se_PM2.5 = sd_PM2.5 / sqrt(n))
    `summarise()` has grouped output by 'Year'. You can override using the
    `.groups` argument.
    ggplot(df_couty, 
           aes(y = County, x = mean_PM2.5, 
               xmin = mean_PM2.5 - 1.96 * se_PM2.5, 
               xmax = mean_PM2.5 + 1.96 * se_PM2.5, 
               color = Year, group = Year)) +
      geom_pointrange(size = 0.3, fatten = 1.2) +
      labs(title = paste0("Daily PM2.5 Concentration (2002 vs 2022), by County"),
           x = "Daily PM2.5 Concentration (95% CI)", y = "County",) +
      scale_color_manual(name = "Year",
                         values = c("2002" = "#F8766D", "2022" = "#00BFC4")) +
      theme_minimal() +
      theme(legend.position = "bottom",
            legend.box = "horizontal",
            legend.justification = "center")

    Level 3: Sites in LA County

    Summary. The mean of daily PM 2.5 for all cities in Los Angeles county in 2022 are lower than that of 2000

  summary5 <- df %>% 
           filter(County == "Los Angeles", 
                  Year %in% c("2002", "2022"),
                  Local.Site.Name != "") %>%
      group_by(Year, Local.Site.Name) %>%
      summarise(
        n             = n(),
        mean          = round(mean(PM2.5, na.rm = T), 2), 
        sd            = round(sd(PM2.5, na.rm = TRUE), 2)) %>%
      mutate(`Mean (SD)`= paste0(mean, " (", sd, ")")) %>%
      select(Year, Local.Site.Name, `Mean (SD)`) %>%
      pivot_wider(
        names_from = Year, 
        values_from = `Mean (SD)`, 
        names_prefix = "Mean (SD) in ") %>%
      mutate(`Local Site Name` = Local.Site.Name) %>%
      select(`Local Site Name`, `Mean (SD) in 2002`, `Mean (SD) in 2022`) %>%
      arrange(`Local Site Name`) 
`summarise()` has grouped output by 'Year'. You can override using the
`.groups` argument.
  kable(as.data.frame(summary5), align = "c")
Local Site Name Mean (SD) in 2002 Mean (SD) in 2022
Azusa 20.76 (12.11) 9.72 (4.39)
Burbank 23.97 (12.74) NA
Compton NA 12.99 (6.22)
Glendora NA 8.42 (5.47)
Lancaster-Division Street 10.38 (4.47) 7.52 (2.48)
Lebec 4.82 (2.78) 3.5 (1.65)
Long Beach (North) 19.47 (10.39) 9.92 (3.27)
Long Beach (South) NA 11.97 (4.32)
Long Beach-Route 710 Near Road NA 13.02 (5.61)
Los Angeles-North Main Street 21.97 (11.69) 11.58 (4.57)
Lynwood 23.35 (11.96) NA
North Hollywood (NOHO) NA 13.02 (4.75)
Pasadena 20.29 (11.14) 9.09 (3.68)
Pico Rivera #2 NA 11.45 (5.93)
Reseda 18.85 (10.65) 10.72 (4.56)
Santa Clarita NA 9.14 (3.9)
Signal Hill (LBSH) NA 8.85 (4.42)
  ggplot(df %>% 
           filter(County == "Los Angeles", 
                  Year %in% c("2002", "2022"),
                  Local.Site.Name != ""), 
       aes(x = PM2.5,
           y = Local.Site.Name,
           fill = Year)) +
  geom_violin(alpha = 0.6, scale = "width", 
              position = position_dodge(width = 0.8), trim = FALSE) +
  labs(
    title = "Daily PM2.5 Concentration in LA County",
    subtitle = "By monitoring sites (2002 vs 2022)",
    x = "Daily PM2.5 Concentration",
    y = "Site",
    fill = "Year") +
  scale_color_manual(name = "Year",
                         values = c("2002" = "#F8766D", "2022" = "#00BFC4")) +
  scale_fill_manual(name = "Year",
                         values = c("2002" = "#F8766D", "2022" = "#00BFC4")) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.box = "horizontal",
    legend.justification = "center"
  )
Warning: No shared levels found between `names(values)` of the manual scale and the
data's colour values.

Reminder: after you upload your final rendered document to GitHub, you should download it to make sure that it looks right! If you haven’t included embed-resources: true in the YAML header, none of your figures will show up!